# Here is another example of looking at two variables,
# x and y, where we think that there might be a linear
# relation between the variables.
# In this case we are given the values for x and y
#    x = 31.9 15.6 18.2 17.3 28.2 20.6 29.5 11.8 25.1 29.2
#    y = 28.5 13.6 18.0 15.7 28.4 22.6 21.6 11.6 19.2 21.5
# We need to put these into our environment
x <- c(31.9,15.6,18.2,17.3,28.2,20.6,29.5,11.8,25.1,29.2)
y <- c(28.5,13.6,18.0,15.7,28.4,22.6,21.6,11.6,19.2,21.5)
# then, a simple plot suggests that there is a linear
# relation between these two sets of values
plot( x, y )
# we can spruce up that plot by making some adjustments
plot(x, y,
     xlim=c(0,40),   # sets our x range from 0 to 40
     ylim=c(0,30),   # sets our y range from 0 to 30
     xaxp=c(0,40,8), # sets our x tick marks to
                     # 0 to 40 with 8 intervals
     yaxp=c(0,30,6), # sets our y tick marks to
                     # 0 to 30 with 6 intervals
     col="black",    # selects BLACK as the color of
                     # our points
     pch=16)         # makes our points appear as
                     # large black dots
# give the command ?points to see all the
# choices for the dots
# then, we can provide some background for our graph.
# we will put horizontal and vertical dashed
# dark blue lines even with the tick marks
abline( h=seq(0,30,5), v=seq(0,40,5),
        lty="dashed", col="darkblue")
# and we can make the axes stand out with black
# lines that are a little wider than normal
abline(h=0, v=0, col="black", lwd=2)
# as a small aid to reading our plot,
# let us draw light vertical lines
# at each of our x-values
abline( v=x, col="orange")
# just to review the values that we have
x
y
# It will be a bit better if we can see these
# pairs of values in a vertical setting,
# so put these in a dataframe
our_view <- data.frame(x,y)
our_view
# Looking at the points in the graph, I think
# that a line going through (0,10) and (40,30)
# might be a good representation of the relation
# between x and y.
# We can draw such a line on the graph if we know
# the intercept and the slope.
# The intercept is at (0,10) and the slope is
# 20/40 or 1/2
abline( 10, 1/2, col="blue", lwd=2)
# We have specific x values. We want to find the
# expected y-value for each of those known
# x-values.
# To do this we just use the equation
#    y = 10 + (1/2)*x
# with each of the x values that we have
expect_1 <- 10+(1/2)*x
expect_1
# we can append those to our dataframe
our_view$expect_1 <- expect_1
our_view
# and we can add those expected values to our plot
points( x, expect_1, col="blue", pch=0)
# Now we want to find how far off the observed
# values are from their corresponding expected
# values, i.e., find the observed - expected
our_view$diff_1 <- y - expect_1
our_view
# now find the square of the observed - expected
our_view$sqr_diff_1 <- our_view$diff_1^2
our_view
# then add up all of those squared differences
# we could use sum() but we will use cumsum()
# to get this
our_view$total_1 <- cumsum( our_view$sqr_diff_1)
our_view
# so our sum of the squared differences between the
# observed y-values and the expected y-values
# is 107.08
# That was interesting, but can we find a line
# that is a better representation of the linear
# relation between x and y? We will know it is
# better if it produces a lower sum of the squared
# differences between the observed and the
# expected values.
# let us try another line...
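# (An optional aside, not part of the original walk-through: the
# steps above -- expected values, differences, squares, sum -- can
# be wrapped in a small helper function so that any candidate line
# can be scored quickly. This is only a sketch; the function name
# sse_for_line is made up here, and it uses the x and y values
# already in our environment.)
sse_for_line <- function( intercept, slope ) {
  expected <- intercept + slope*x    # expected y for each x
  sum( (y - expected)^2 )            # sum of squared differences
}
sse_for_line( 10, 1/2 )   # should reproduce the 107.08 found above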
# how about the line through (0,0) and (35,30)?
# that gives the intercept at (0,0), and the slope is
# 30/35 or 6/7
abline( 0, 6/7, col="red", lwd=2)
# On that line the expected values,
# using y = 0 + (6/7)*x, are
expect_2 <- 0+(6/7)*x
expect_2
# let us create a second dataframe for this
our_view_2 <- data.frame( x, y, expect_2)
our_view_2
# add those expected values to our plot
points( x, expect_2, col="darkred", bg="red", pch=23)
# then find the observed - expected
our_view_2$diff_2 <- y - expect_2
our_view_2
# now find the square of the observed - expected
our_view_2$sqr_diff_2 <- our_view_2$diff_2^2
our_view_2
# then add up all of those squared differences
# we could use sum() but we will use cumsum()
# to get this
our_view_2$total_2 <- cumsum( our_view_2$sqr_diff_2)
our_view_2
# so our sum of the squared differences between the
# observed y-values and the expected y-values
# is 83.821837
# Therefore, our second line, the red one, has
# a lower sum of the squared differences
# between the observed values and the expected
# values than the sum of the squared differences
# for the first line, the blue one.
# That makes the red line a better fit
# for our data.
################
# we could try other lines that we might think up,
# but the question is, what is the
# equation of the best line?
# to find this we use the lm() command to get our
# linear model.
lm_hold <- lm( y ~ x )
lm_hold
# this tells us that the best fitting line has
# an intercept of 3.924 and a slope of 0.710.
# That is, the best line is
#    y = 3.924 + 0.710*x
# because we saved our linear model we can draw
# that line on our graph
abline( lm_hold, col="green", lwd=2)
# There is actually a lot more to our model than
# just the intercept and the slope. We can see some
# of this by using the summary() function.
summary( lm_hold )
# In that output we find both the intercept
# and the slope, but with more decimal places.
# We can even pull the intercept and the slope
# out of the model to see even more decimal
# places. We do this with the coefficients()
# function.
coeffs <- coefficients( lm_hold )
coeffs
# In particular, then,
# coeffs[1] holds the intercept
# and coeffs[2] holds the slope.
# Let us go through the same process that we
# did above to find the sum of the squared
# differences between observed and expected
# values using these new intercept and slope
# values.
our_view_3 <- data.frame( x, y )
# get our new expected values
our_view_3$expect_3 <- coeffs[1] + coeffs[2]*x
our_view_3
# then the observed - expected values
our_view_3$diff_3 <- y - our_view_3$expect_3
our_view_3
# then find the squared differences
our_view_3$sqrd_diff <- our_view_3$diff_3^2
our_view_3
# Then get the sum of the squared differences
our_view_3$cum_sum <- cumsum( our_view_3$sqrd_diff)
our_view_3
# that sum, 71.119232, is lower than was the
# sum for our red line. Not only is this a
# better fit line, this is the best fit line!
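# (Another optional aside, just as a check on the values that lm()
# reported: the least squares intercept and slope can also be
# computed directly from the data with the usual formulas
#    slope     = sum( (x - mean(x))*(y - mean(y)) ) / sum( (x - mean(x))^2 )
#    intercept = mean(y) - slope*mean(x)
# The names ls_slope and ls_intercept below are made up for this
# sketch and are not used elsewhere in the walk-through.)
ls_slope <- sum( (x - mean(x))*(y - mean(y)) ) / sum( (x - mean(x))^2 )
ls_intercept <- mean(y) - ls_slope*mean(x)
ls_slope       # should agree with coeffs[2]
ls_intercept   # should agree with coeffs[1]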
# Before we leave this look at our linear
# regression, there is one more topic. Above we
# saw that we could find the difference between
# the observed values, our y-values, and the
# expected values, those that we put into
# our_view_3$expect_3. Above we stored those
# differences in our_view_3$diff_3.
# we have a special name for such differences.
# They are called the residuals. We can
# calculate the residuals, as we did above.
# For example, in our data the sixth pair of
# x,y values is (20.6, 22.6). Thus, when
# x is 20.6 the observed y value is 22.6.
# When x is 20.6, the expected value is
coeffs[1] + coeffs[2] * 20.6
# which means that the residual value when x=20.6
# is
22.6 - (coeffs[1] + coeffs[2] * 20.6)
# of course, we calculated that before, as shown
# in the sixth line of our_view_3
# However, since we stored our linear model
# in the variable lm_hold we can use the function
# residuals() to pull all of the residual values
# out of the model.
residuals( lm_hold )
# It is helpful to get the residual values because
# we want to look at a plot of the x-values and
# the residual values. We want to see that the
# points in that plot are all over the place, not
# forming some sort of pattern. With only 10 points
# in our data it is hard to see that these are
# spread out and not in a pattern.
plot( x, residuals( lm_hold) )
######################
# Now, we have seen that we can find the best
# fit regression line. What we want now is some
# measure of just how good a fit that is.
# we do that with the correlation coefficient.
cor( x, y )
# The correlation coefficient is always a value
# between -1 and 1. The square of the
# correlation coefficient tells us the percent
# of the variability we see in our data that is
# "explained" by the linear regression model.
# in our example this is
0.8682721^2
# so our linear model, y = 3.924 + 0.710*x,
# explains just over 75% of the variation seen
# in our data.
#
# To be sure, we actually generated
# the x and y values above using
source("../gnrnd4.R")
gnrnd4( key1=1345920906, key2=6120650913,
        key3=23000100 )
# Let us generate a larger set of data and then
# quickly run through doing a plot, getting the
# coefficients of the regression equation,
# getting and plotting the residuals, and then
# finding the correlation coefficient.
gnrnd4( key1=1345922306, key2=4185651309,
        key3=23000100 )
plot( L1, L2 )
# clean up that plot
plot( L1, L2,
      xlim=c(0,40),    # sets our x range from 0 to 40
      ylim=c(0,60),    # sets our y range from 0 to 60
      xaxp=c(0,40,8),  # sets our x tick marks to
                       # 0 to 40 with 8 intervals
      yaxp=c(0,60,12), # sets our y tick marks to
                       # 0 to 60 with 12 intervals
      col="black",     # selects BLACK as the color of
                       # our points
      pch=16,          # makes our points appear as
                       # large black dots
      las=1,           # makes our axis values appear
                       # in a horizontal alignment
      cex.axis=0.5     # scales the axis values to 1/2
                       # their usual size
)
# we will put horizontal and vertical dashed
# dark blue lines even with the tick marks
abline( h=seq(0,60,5), v=seq(0,40,5),
        lty="dashed", col="darkblue")
# and we can make the axes stand out with black
# lines that are a little wider than normal
abline(h=0, v=0, col="black", lwd=2)
my_line <- lm( L2 ~ L1 )
my_line
# so our regression equation is
#    y = 63.986 - 1.521*x
abline( my_line )
plot( L1, residuals( my_line ))
cor( L1, L2 )
cor( L1, L2 )^2
# so our model explains about 93.6% of the
# variability of the data
# you might note that had we done
summary( my_line )
# that 0.9365 appears as the
#    Multiple R-squared:  0.9365
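# (A final optional aside, shown only as a sketch: the R-squared
# value reported by summary() can be pulled out of the summary
# object directly and compared to the squared correlation.)
summary( my_line )$r.squared
# this should match cor( L1, L2 )^2, about 0.9365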